#-----------------------------------#
# Replication of:
# Seeking Opportunity in the Knowledge Economy: Moving Places, Moving Politics?
# Valentina Consiglio, Thomas Kurer

# File: 3_dataprep

# Note: This file takes the following steps:
# (A) Recodes all relevant variables from the SOEP
# (B) Merges it with the opportunity index data and codes additional index-based variables
# (C) Re-weights opportunity index for analysis at labor market region (LMR) level
# (D) Applies the final sample selection

#-----------------------------------#

# Clear workspace
rm(list=ls())

#### Load Packages ####

library(pacman)

p_load(car, arm, lmtest, vars, ggplot2, stargazer, tidyverse, dplyr, readxl,
       haven, labelled, sjlabelled, patchwork, survey, plm, zoo, data.table)

#-------------------------------------------------------------------------------# 

#### Define relevant paths ####
orig_path <- "data_original/"
data_path <- "data_created/"


#-------------------------------------------------------------------------------# 
#### A | Modify SOEP data ####
#-------------------------------------------------------------------------------# 

#### Import data #### 

# Merged SOEP
soep <- read_dta(paste0(data_path, 'soepregio_ext.dta'))

glimpse(soep)


#-------------------------------------------------------------------------------# 
#### 1 | (Re)code Socio-Demographics ####
#-------------------------------------------------------------------------------# 

## Age ##
soep$age <- soep$syear - soep$gebjahr

# Relevant age groups
soep <- soep %>%
  mutate(age_grp = case_when(age %in% c(18:25) ~ 1,
                             age %in% c(26:35) ~ 2,
                             age %in% c(36:45) ~ 3,
                             age %in% c(46:55) ~ 4,
                             age %in% c(56:65) ~ 5,
                             age %in% c(66:70) ~ 6
  )) 

val_labels(soep$age_grp) <- c("18-25 yrs" = 1, "26-35 yrs" = 2, "36-45 yrs" = 3,
                                "46-55 yrs" = 4, "56-65 yrs" = 5, "66-70 yrs" = 6)
# // only grouped those ages that are included in restricted sample in script 5_

# More aggregated groups #
soep <- soep %>%
  mutate(age_grp3 = case_when(age %in% c(18:29) ~ 1,
                              age %in% c(30:49) ~ 2,
                              age %in% c(50:70) ~ 3
  )) 

val_labels(soep$age_grp3) <- c("18-29 yrs" = 1, "30-49 yrs" = 2, "50-70 yrs" = 3)

## Gender ##

# need to be recoded as numeric in order to recode as haven_labelled
soep$sex <- as.numeric(soep$sex)
val_labels(soep$sex) <- c("Male" = 1, "Female" = 2)

soep <- soep %>%
  mutate(female = case_when(sex == 2 ~ 1,
                            sex == 1 ~ 0))

## Region (east/west) ##

# Recode as east if Berlin (3), Brandenburg (4),  
# Mecklenburg-Vorpommern (8), Sachsen (13), Sachsen-Anhalt (14), Thueringen (16)

soep <- soep %>%
  mutate(west = case_when(nuts1 %in% c(1, 2, 5, 6, 7, 9, 10, 11, 12, 15) ~ 1, # Western Germany
                          nuts1 %in% c(3, 4, 8, 13, 14, 16) ~ 0 # Eastern Germany 
         )) 

soep <- soep %>%
  mutate(region = case_when(nuts1 %in% c(5, 6, 9, 15) ~ 1, # North (SH, HH, NS, B)
                          nuts1 %in% c(1, 2, 7) ~ 2, # South (H, BW, B)
                          nuts1 %in% c(10, 11, 12) ~ 3, # West (NRW, RP, SL)
                          nuts1 %in% c(3, 4, 8, 13, 14, 16) ~ 4 # East (B, BB, MV, S, SA, T)
  ))


# Assign value labels
val_labels(soep$west) <- c("Western Germany" = 1, "Eastern Germany" = 0)

val_labels(soep$region) <- c("North" = 1, "South" = 2, "West" = 3, "East" = 4)


# Create lagged regional variable # 

# Order data
soep = soep[order(soep$pid, soep$syear), ]

soep <-  soep %>%
  group_by(pid) %>%
  dplyr::mutate(region_lag1 = dplyr::lag(region, n=1, default = NA)) %>%
  ungroup()

soep <-  soep %>%
  group_by(pid) %>%
  dplyr::mutate(west_lag1 = dplyr::lag(west, n=1, default = NA)) %>%
  ungroup()

           
## Martial status ##

soep$marstat <- soep$d11104

# aggregate marital status
soep <- soep %>%
  mutate(marstatag = case_when(marstat == 1 ~ 1, # Married
                                marstat == 2 ~ 2, # Single
                                marstat %in% c(3:5) ~ 3 # Other (including widowed, divorced, separated)
                                ))

# Assign value labels
val_labels(soep$marstatag) <- c("Married" = 1, "Single" = 2, "Other status" = 3)

## Household typology ##

soep$hgtyp1hh <- as.numeric(soep$hgtyp1hh)

soep <- soep %>%
  mutate(hhtype = case_when(hgtyp1hh == 1 ~ 1, # Single-HH
                            hgtyp1hh == 2 ~ 2, # Couple-HH without kids
                            hgtyp1hh == 3 ~ 3, # Single-Parent
                            hgtyp1hh %in% c(4:6) ~ 4, # Couple-HH with kids
                            hgtyp1hh %in% c(7,8) ~ 5, # Other (i.e., multiple generation HH and other combinations)
                           ))

class(soep$hhtype)

soep <- soep %>%
  mutate(hhtype = case_when(hgtyp1hh == 1 ~ 1, # Single-HH
                            hgtyp1hh == 2 ~ 2, # Couple-HH without kids
                            hgtyp1hh == 3 ~ 3, # Single-Parent
                            hgtyp1hh %in% c(4:6) ~ 4, # Couple-HH with kids
                            hgtyp1hh %in% c(7,8) ~ 5, # Other (i.e., multiple generation HH and other combinations)
  ))

class(soep$hhtype)

# Assign value labels
soep$hhtype <- as.numeric(soep$hhtype)
val_labels(soep$hhtype) <- c("Single-HH" = 1, "Couple-HH without kids" = 2, "Single-Parent" = 3, 
                                  "Couple-HH with kids" = 4, "Other HH-type" = 5)

## Number of children ##

soep$nochil <- soep$d11107

# dummy for (no-)children
soep <- soep %>%
  mutate(childd = case_when(nochil > 0 ~ 1, # at least one child
                            nochil == 0 ~ 0, # no children
                            ))

# Assign value labels
val_labels(soep$childd) <- c("At least one child" = 1, "No children" = 0)

## Number of HH members ##

soep$nohhmem <- soep$d11106

## Education ##

# pgisced11 used from 2010 onwards

# Code aggregated groups:
# In school == 0; Lower: 1/2; Medium = 3/4; Higher = 5/8
table(soep$pgisced11)

soep <- soep %>%
  mutate(educ = case_when(pgisced11 == 0 ~ 0, # In school
                          pgisced11 %in% c(1,2) ~ 1, # Lower education
                          pgisced11 %in% c(3,4) ~ 2, # Medium education
                          pgisced11 %in% c(5:8) ~ 3 # Higher education
                         ))

# Assign value labels
val_labels(soep$educ) <- c("In school" = 0, "Lower education" = 1, "Medium education" = 2,
                             "Higher education" = 3)

# Code also isced97 for years <2011

# In school: 0, Low: 1,2, Medium 3,4,, High: 5,6

soep <- soep %>%
  mutate(educ_h = case_when(pgisced11 == 0 | pgisced97 == 0 ~ 0, # In school
                          pgisced11 %in% c(1,2) | pgisced97 %in% c(1,2) ~ 1, # Lower education
                          pgisced11 %in% c(3,4) | pgisced97 %in% c(3,4) ~ 2, # Medium education
                          pgisced11 %in% c(5:8) | pgisced97 %in% c(5,6) ~ 3 # Higher education
  ))

# Assign value labels
val_labels(soep$educ_h) <- c("In school" = 0, "Lower education" = 1, "Medium education" = 2,
                           "Higher education" = 3)


# dummy college degree / higher education
soep <- soep %>%
  mutate(colleged = case_when(educ == 3 ~ 1, # College / higher education
                              educ %in% c(1,2) ~ 0 # Lower and medium education
                              ))

# Assign value labels
val_labels(soep$colleged) <- c("College degree" = 1, "No college degree" = 0)


# Identify those who change education within panel and create new variable with year in which higher education was obtained
soep$higher_educ <- ifelse(soep$educ_h == 3, 1, 0)

# Group by pid and create dummy that indicates year of HE change
first_he_year <- soep %>% 
  group_by(pid) %>%
  summarize(higher_educ_first = ifelse(any(higher_educ == 1, na.rm = TRUE), min(syear[higher_educ == 1], na.rm = TRUE), 0), 
            .groups = 'drop') %>%
  ungroup()


# Join back to main dataset
soep <- soep %>%
  left_join(first_he_year, by = "pid")

# Create variable that indicates which years to drop
soep <- soep %>%
  mutate(educ_filter = case_when(higher_educ_first > syear & higher_educ_first > 0 ~ 0,
                                 higher_educ_first <= syear ~ 1, 
                                 higher_educ_first == 0  ~ 1
  ))



## (Non-)academic family background ##

soep$fprofedu
soep$mprofedu

# Dummies indicating whether mother/father has studied
# Note: If respondent did not know, coded as 0 (based on assumption that person did not benefit from academic fb)
class(soep$fprofedu)
soep$facad <-  ifelse(soep$fprofedu >= 30 & soep$fprofedu <= 32, 1,
                      ifelse((soep$fprofedu >= 10 & soep$fprofedu <= 28) | soep$fprofedu ==40 | soep$fprofedu == 50 | soep$fprofedu == 0, 0, NA))

soep$macad <-  ifelse(soep$mprofedu >= 30 & soep$mprofedu <= 32, 1,
                      ifelse((soep$mprofedu >= 10 & soep$mprofedu <= 28) | soep$mprofedu ==40 | soep$mprofedu == 50 | soep$fprofedu == 0, 0, NA))


## Code (non-)academic family background

# Dummy non-academic
soep$nacfbg <- ifelse(soep$facad == 0 & soep$macad == 0, 1,
                      ifelse(soep$facad == 1 | soep$macad == 1, 0, NA))

# Assign value labels
class(soep$nacfbg)
val_labels(soep$nacfbg) <- c("Non-academic family" = 1, "Academic family" = 0)

val_labels(soep$nacfbg)

# Dummy academic
soep$acfbg <- ifelse(soep$facad == 1 | soep$macad == 1, 1,
                     ifelse(soep$facad == 0 & soep$macad == 0, 0, NA))

# Assign value labels
class(soep$acfbg)
val_labels(soep$acfbg) <- c("Academic" = 1, "Non-academic" = 0)
val_labels(soep$acfbg)


##  Occupation (ISCO) ##

# Code pgisco08
# 1-digit ISCO
soep <- soep %>%
  mutate(isco08_1d = substr(pgisco08, 1, 1))

# Code pgisco88
# 1-digit ISCO
soep <- soep %>%
  mutate(isco88_1d = substr(pgisco88, 1, 1))

# New 1digit ISCO variable (ISCO88 for 2010/11/12 and ISCO08 from 2013 onwards)
soep <- soep %>%
  mutate(isco_h_1d = ifelse(syear < 2013, isco88_1d, isco08_1d))

soep$isco_h_1d[soep$isco_h_1d == "-"] <- NA
soep$isco_h_1d <- as.numeric(soep$isco_h_1d)

soep <- soep %>%
  group_by(pid) %>%
  dplyr::mutate(isco_h_1d_lag1_ub = dplyr::lag(isco_h_1d, n=1, default = NA),
                occ_chg = case_when(isco_h_1d_lag1_ub != isco_h_1d ~ 1,
                                    isco_h_1d_lag1_ub == isco_h_1d ~ 0,
                                    is.na(isco_h_1d_lag1_ub) ~ 0
                                    
                )
  ) %>%
  as.data.frame() %>%
  ungroup()


## Migration background ##

# 1 = no migback, 2 = direct migback, 3 = indirect migback

# dummy for migback
soep <- soep %>%
  mutate(migbackd = case_when(migback %in% c(2,3) ~ 1, # Direct or indirect migback
                              migback == 1 ~ 0, # No migback
                             ))

# Assign value labels
val_labels(soep$migbackd) <- c("Migration background" = 1, "No migration background" = 0)



#-------------------------------------------------------------------------------# 
#### 2 | (Re)code Labour Market and Income Variables ####
#-------------------------------------------------------------------------------# 

## i11102	Household disposable income ##
soep$dishhinc <- soep$i11102

## i11103	Household labour income ##
soep$hhlabinc <- soep$i11103


## pglabgro	Current Gross Labor Income in Euro ##

# Log
soep$ln_pglabgro <- log(soep$pglabgro)

soep$ln_pglabgro[soep$ln_pglabgro == "-Inf"] <- "NA"

soep$ln_pglabgro[soep$ln_pglabgro == 0] <- NA
soep$ln_pglabgro2 <- log(soep$pglabgro)

## Average monthly household equivalent income (square-root method) ##

soep$income <- ((soep$dishhinc/sqrt(soep$nohhmem))/12) 

# Code 6 income groups relative to median
# poor, low, lower middle, mid middle, upper middle, high

# Calculate median income per year

# Set survey design
ds_svy_design_phrf1 <- survey::svydesign(data = soep, ids = ~pid, weights = ~phrf1)

svy_median_income <- svyby(formula = ~income,
                           by = ~syear,
                           design = ds_svy_design_phrf1,
                           FUN = svyquantile,
                           quantiles = 0.5,
                           na.rm = TRUE
)

# rename income
svy_median_income$income_median <- svy_median_income$income

svy_median_income <- svy_median_income %>%
  select(-c(se.income, income))

# Join with soep
soep <-  left_join(soep, svy_median_income, by = "syear")

soep <- soep %>%
  mutate(income_grp = case_when(income < 0.5*(income_median) ~ 1, # Poor
                                income >= 0.5*(income_median) & income < 0.75*(income_median) ~ 2, # Vulnerable
                                income >= 0.75*(income_median) & income < 1*(income_median) ~ 3, # Lower middle
                                income >= 1*(income_median) & income < 1.5*(income_median) ~ 4, # Mid middle
                                income >= 1.5*(income_median) & income < 2*(income_median) ~ 5, # Upper middle
                                income >= 2*(income_median) ~ 6, # High
  ))


val_labels(soep$income_grp) <- c("Poor (<50% of median)" = 1, 
                                 "Vulnerable (50-75% of median)" = 2, 
                                 "Lower middle (75-100% of median)" = 3,
                                 "Mid middle (100-150% of median)" = 4, 
                                 "Upper middle (150-200% of median)" = 5,
                                 "High (>200% of median)" = 6
)

# Income group clean (no students) #
# Clean income variable without students #
soep$pgstib
# 11 = in Ausbildung (in education)

soep$income_grp_clean <- soep$income_grp

soep$income_grp_clean[soep$pgstib == 11] <- NA 

# Continuous income clean
soep$income_clean <- soep$income

soep$income_clean[soep$pgstib == 11] <- NA


## pgstib	Occupational position ##
is.factor(soep$pgstib)
val_labels(soep$pgstib)

table(soep$pgegp08, soep$pgstib, exclude = NULL)



## pgemplst	Employment status ##

# Assign value labels
soep$pgemplst
val_labels(soep$pgemplst) <- c("Full-time" = 1, "Part-time" = 2, "Apprentice" = 3,
                                "Unregular/marignal employment" = 4, "Not employed" = 5,
                                "Sheltered workshop" = 6)

# Create lagged variable 
soep = soep[order(soep$pid, soep$syear), ]

soep <-  soep %>%
  group_by(pid) %>%
  dplyr::mutate(pgemplst_lag1 = dplyr::lag(pgemplst, n=1, default = NA)) %>%
  ungroup()




soep <- soep %>%
  mutate(emplst_aggr = case_when(pgemplst == 5  ~ 1, # Not employed (here still includes people studying and registered UE, and apprenticeships from later onwards)
                                 pgstib == 12 ~ 2, # Registered unemployed
                                 pgstib == 11  ~ 3, # NE: in Ausbildung, inkl. Weiterbildung, Berufsausbildung, Lehre
                                 pgstib == 13 ~ 4, # in pension 
                                 pgemplst %in% c(1, 2, 3, 4) ~ 5 # employed (including apprenticeship and marginal employment)
  ) 
  )

val_labels(soep$emplst_aggr) <- c("Not employed" = 1, 
                                       "Registered unemployed" = 2,
                                       "In education/training" = 3, 
                                       "In pension" = 4, 
                                       "Employed" = 5
)

# as factor
soep$af_emplst_aggr <- as.factor(soep$emplst_aggr)



## pgegp08	Last reached EGP value ##
soep$pgegp08
soep <- soep %>%
  mutate(egpag = case_when(pgegp08 == 1 ~ 1, # Higher managerial position and professional workers
                           pgegp08 == 2 ~ 2, # Lower managerial position and professional workers
                           pgegp08 %in% c(3,4) ~ 3, # Routine non-manual workers
                           pgegp08 %in% c(5,6,11) ~ 4, # Small self-employed and self-employed farmers
                           pgegp08 == 7 ~ 5, # Manual supervisors
                           pgegp08 == 8 ~ 6, # Skilled manual workers                          
                           pgegp08 %in% c(9,10) ~ 7 # Semi- and unskilled manual workers and agricultural labourers
                          ))

# Assign value labels
val_labels(soep$egpag) <- c("Higher managerial and professional" = 1, 
                              "Lower managerial and professional" = 2, 
                              "Routine non-manual" = 3,
                              "Self-employed (small, farmers)" = 4, 
                              "Manual supervisors" = 5,
                              "Skilled manual" =6,
                              "Semi-/unskilled manual and agricultural" = 7)


# Create extended egpag-variable including unemployed and in education 

soep <- soep %>%
  mutate(egpag_ext = case_when(pgegp08 == 1 ~ 1, # Higher managerial position and professional workers
                               pgegp08 == 2 ~ 2, # Lower managerial position and professional workers
                               pgegp08 %in% c(3,4) ~ 3, # Routine non-manual workers
                               pgegp08 %in% c(5,6,11) ~ 4, # Small self-employed and self-employed farmers
                               pgegp08 == 7 ~ 5, # Manual supervisors
                               pgegp08 == 8 ~ 6, # Skilled manual workers                          
                               pgegp08 %in% c(9,10) ~ 7, # Semi- and unskilled manual workers and agricultural labourers,
                               pgstib == 10 ~ 8, # Not employed
                               pgstib == 12 ~ 9, # Registered unemployed
                               pgstib == 13 ~ 10 # In pension
                               
  ))

val_labels(soep$egpag_ext) <- c("Higher managerial and professional" = 1, 
                              "Lower managerial and professional" = 2, 
                              "Routine non-manual" = 3,
                              "Self-employed (small, farmers)" = 4, 
                              "Manual supervisors" = 5,
                              "Skilled manual" =6,
                              "Semi-/unskilled manual and agricultural" = 7,
                              "Not in employment" = 8,
                              "Unemployed (registered)" = 9,
                              "Pension" = 10
)


# Lag egpag for descriptive analysis
soep <-  soep %>%
  group_by(pid) %>%
  mutate(egpag_ext_lag1 = lag(egpag_ext, n=1, default = NA)) %>%
  ungroup()

val_labels(soep$egpag_ext_lag1)

# Create more aggregated version (dummy)

# recode egpag_ext  as employment status dummy
soep <- soep %>%
  mutate(emplst_d = case_when(egpag_ext %in% c(1, 2, 3, 4, 5, 6, 7) ~ 1, # employed
                              egpag_ext == 8 ~ 2, # not employed
                              egpag_ext == 9 ~ 3, # registered unemployed
                              egpag_ext == 10 ~ 4 # in pension
  ))


val_labels(soep$emplst_d) <- c("Employed" = 1,
                                    "Not in employment" = 2,
                                    "Unemployed (registered)" = 3,
                                    "Pension" = 4
)

# Lag employment status dummy

soep = soep[order(soep$pid, soep$syear), ]

soep <-  soep %>%
  group_by(pid) %>%
  dplyr::mutate(emplst_d_lag1 = dplyr::lag(emplst_d, n=1, default = NA)) %>%
  ungroup()


## Unemloyment ##

soep$ue_d <- ifelse(soep$emplst_d == 3, 1, 0)

## pgexpue	Unemployment experience ##
soep$pgexpue

## pgexpft	Working experience full-time employment ##
soep$pgexpft

# pgexppt	Working experience part-time employment ##
soep$pgexppt


#-------------------------------------------------------------------------------# 
#### 3 | (Re)code Political Outcome Variables ####
#-------------------------------------------------------------------------------# 

#-------------------------------------------------------------------------------# 
##### 3.1 | Party Leaning (yes/no) and party dummies #####
#-------------------------------------------------------------------------------# 

## plh0011_h	Party leaning (yes/no) ##
soep$partyleand <- soep$plh0011_h
val_labels(soep$plh0011_h)
# 1 = Ja, 2 = Nein, 3 = weiß nicht

# Code don't know as NA
soep <- soep %>%
  mutate(partyleand = case_when(plh0011_h == 1 ~ 1,
                                plh0011_h == 2 ~ 0))

val_labels(soep$partyleand) <- c("Yes" = 1, 
                                 "No" = 0)

# Code don't know as no
soep <- soep %>%
  mutate(partyleand2 = case_when(plh0011_h == 1 ~ 1,
                                 plh0011_h == 2 ~ 0,
                                 plh0011_h == 3 ~ 0))

val_labels(soep$partyleand2) <- c("Yes" = 1, 
                                 "No" = 0)

## Party ##
soep$plh0012_h

# Create new variable
soep$partylean <- NA
soep$partylean[soep$plh0012_h == 27] <- 1 # AfD
soep$partylean[soep$plh0012_h == 2] <- 2 # CDU
soep$partylean[soep$plh0012_h == 3] <- 3 # CSU
#soep$partylean[soep$plh0012_h == XX] <- 4 # Die Tier # not available in SOEP
soep$partylean[soep$plh0012_h == 4] <- 5 # FDP
soep$partylean[soep$plh0012_h == 5] <- 6 # Grunen
soep$partylean[soep$plh0012_h == 6] <- 7 # Linke
soep$partylean[soep$plh0012_h == 26] <- 8 # Piraten
soep$partylean[soep$plh0012_h == 1] <- 9 # SPD
soep$partylean[soep$plh0012_h == 7] <- 10 # NPD
soep$partylean[soep$plh0012_h == 13] <- 11 # CDU/CSU
soep$partylean[soep$plh0012_h == 9] <- 12 # SPD/Grunen
soep$partylean[soep$plh0012_h == 16] <- 13 # Grunen/Linke
soep$partylean[soep$plh0012_h == 17] <- 14 # SPD/Linke

# Will be NA: Cross-party combinations except CDU/CSU and SPD/Grunen

val_labels(soep$partylean) <- c("AfD" = 1, 
                                "CDU" = 2,
                                "CSU" = 3,
                                "FDP" = 5,
                                "Grunen" = 6,
                                "Linke" = 7,
                                "Piraten" = 8,
                                "SPD" = 9,
                                "NPD" = 10,
                                "CDU/CSU" = 11,
                                "SPD/Grunen" = 12,
                                "Grunen/Linke" = 13,
                                "SPD/Linke" = 14)



## Recode party dummies including coalitions -- code all as 0 on dummies who do not have any party leaning!##

# SPD or left coalition with SPD (9, 17)
soep$spd_co2 <- NA
soep$spd_co2[soep$plh0012_h == 1 | soep$plh0012_h == 9 | soep$plh0012_h == 17] <- 1 # SPD
soep$spd_co2[soep$plh0012_h != 1 & soep$plh0012_h != 9 & soep$plh0012_h != 17 & is.na(soep$plh0012_h) == FALSE] <- 0 # Other
soep$spd_co2[soep$partyleand == 0] <- 0 # No party leaning

val_labels(soep$spd_co2) <- c("SPD or left coalition" = 1, 
                           "Other" = 0)

# CDU/CSU or right coalition with CDU (21, 30)
soep$cducsu_co2 <- NA
soep$cducsu_co2[soep$plh0012_h == 2 | soep$plh0012_h == 3 | soep$plh0012_h == 13 | soep$plh0012_h == 21 | soep$plh0012_h == 30] <- 1 # CDU / CSU / CDU/CSU
soep$cducsu_co2[soep$plh0012_h != 2  & soep$plh0012_h != 3  & soep$plh0012_h != 13 & soep$plh0012_h != 21 & soep$plh0012_h != 30 & is.na(soep$plh0012_h) == FALSE] <- 0 # Other
soep$cducsu_co2[soep$partyleand == 0] <- 0 # No party leaning

val_labels(soep$cducsu_co2) <- c("CDU/CSU or right coalition" = 1, 
                              "Other" = 0)

# Grüne or left coalition with Greens (9, 16)
soep$greens_co2 <- NA
soep$greens_co2[soep$plh0012_h == 5 | soep$plh0012_h == 9 | soep$plh0012_h == 16] <- 1 # Grüne
soep$greens_co2[soep$plh0012_h != 5 & soep$plh0012_h != 9 & soep$plh0012_h != 16 & is.na(soep$plh0012_h) == FALSE] <- 0 # Other
soep$greens_co2[soep$partyleand == 0] <- 0 # No party leaning

val_labels(soep$greens_co2) <- c("Greens or left coalition" = 1, 
                              "Other" = 0)


# FDP or right coalition
soep$fdp_co2 <- NA
soep$fdp_co2[soep$plh0012_h == 4 | soep$plh0012_h == 14 | soep$plh0012_h == 22| soep$plh0012_h == 25] <- 1 # AfD
soep$fdp_co2[soep$plh0012_h != 4 & soep$plh0012_h != 14 & soep$plh0012_h != 22  & soep$plh0012_h != 25  & is.na(soep$plh0012_h) == FALSE] <- 0 # Other
soep$fdp_co2[soep$partyleand == 0] <- 0 # No party leaning

val_labels(soep$fdp_co2) <- c("FDP or right coalition" = 1, 
                             "Other" = 0)
# AfD or right coalition with AfD (30)
# AfD
table(soep$plh0012_h)
soep$afd_co2 <- NA
soep$afd_co2[soep$plh0012_h == 27 | soep$plh0012_h == 30] <- 1 # AfD
soep$afd_co2[soep$plh0012_h != 27 & soep$plh0012_h != 30  & is.na(soep$plh0012_h) == FALSE] <- 0 # Other
soep$afd_co2[soep$partyleand == 0] <- 0 # No party leaning

val_labels(soep$afd_co2) <- c("AfD or right coalition" = 1, 
                           "Other" = 0)

# Left and left party coalition
soep$linke_co2 <- NA
soep$linke_co2[soep$plh0012_h == 6 | soep$plh0012_h == 16 | soep$plh0012_h == 17] <- 1 # Left
soep$linke_co2[soep$plh0012_h != 6 & soep$plh0012_h != 16 & soep$plh0012_h != 17 & is.na(soep$plh0012_h) == FALSE] <- 0 # Other
soep$linke_co2[soep$partyleand == 0] <- 0 # No party leaning

val_labels(soep$linke_co2) <- c("Linke or left coalition" = 1, 
                             "Other" = 0)

# Centre left 
soep$centre_left2 <- NA
soep$centre_left2[soep$plh0012_h == 1 | soep$plh0012_h == 5] <- 1 # left (SPD, Grüne)
soep$centre_left2[soep$plh0012_h == 9] <- 1 # Combinations: SPD/Grüne)
soep$centre_left2[soep$plh0012_h != 1 & soep$plh0012_h != 5 & 
                 soep$plh0012_h != 9 &
                 is.na(soep$plh0012_h) == FALSE] <- 0 # Other; those who do not have a party leaning at all are coded as NA
soep$centre_left2[soep$partyleand == 0] <- 0 # No party leaning

val_labels(soep$centre_left2) <- c("Centre left party" = 1, 
                                "Other" = 0)


# Centre right (including FDP)
soep$centre_right2 <- NA
soep$centre_right2[soep$plh0012_h == 2 | soep$plh0012_h == 3 |  soep$plh0012_h == 4 ] <- 1 # right (CDU, CSU, FDP)
# Combinations: CDU/CSU (13), CDU/FDP (14), CSU/FDP (22), 
soep$centre_right2[soep$plh0012_h == 13 | soep$plh0012_h == 14 | soep$plh0012_h == 22] <- 1 

soep$centre_right2[soep$plh0012_h != 2 & soep$plh0012_h != 3 & soep$plh0012_h != 4 & 
                  soep$plh0012_h != 13 & soep$plh0012_h != 14  & soep$plh0012_h != 22 & 
                  is.na(soep$plh0012_h) == FALSE] <- 0 # Other; those who do not have a party leaning at all are coded as NA
soep$centre_right2[soep$partyleand == 0] <- 0 # No party leaning

val_labels(soep$centre_right2) <- c("Centre right party" = 1, 
                                 "Other" = 0)


#-------------------------------------------------------------------------------# 
##### 3.2 | Party Leaning Intensity #####
#-------------------------------------------------------------------------------# 

# All with 0 party leaning are coded as 0 intensity

## plh0013_h	Party leaning intensity

soep$partyleanint <- soep$plh0013_h
val_labels(soep$plh0013_h)

# Reverse order so that high intensity is == 5 and low intensity == 1
soep$partyleanintrev <- (soep$plh0013_h - 6)*(-1)

soep <- soep %>%
  mutate(partyleanint = case_when(partyleand == 0 ~ 0, # no party leaning
                                    partyleanintrev == 1 ~ 1,
                                    partyleanintrev == 2 ~ 2,
                                    partyleanintrev == 3 ~ 3,
                                    partyleanintrev == 4 ~ 4,
                                    partyleanintrev == 5 ~ 5
  )) 


## plh0012_h	Party leaning (categories)

soep$partylean <- soep$plh0012_h
val_labels(soep$partylean)
table(soep$partylean)


#-------------------------------------------------------------------------------# 
##### 3.3 | Abstention & Party Federal Election #####
#-------------------------------------------------------------------------------# 

## plh0333	Party federal election
soep$plh0333

soep$partyfedel <- soep$plh0333
is.factor(soep$partyfedel)
val_labels(soep$partyfedel)

soep$partyfedel <- as.factor(soep$partyfedel)

# From plh0333 (2014, 2018)
table(soep$partyfedel, useNA = "always")

# From plh0006 (2010)
table(soep$plh0006, useNA = "always")

soep <- soep %>%
  mutate(abstention = case_when(partyfedel == 28 ~ 1,
                                plh0006 == 2 ~ 1,
                                partyfedel != 28 & is.na(partyfedel) == FALSE ~ 0,
                                plh0006 != 2 & is.na(plh0006) == FALSE ~ 0
  ))

val_labels(soep$abstention) <- c("Yes" = 1, 
                                  "No" = 0)

# Need to lead variable as election was one year before asked #

# Create new data frame with syear -1 
previous_year <- soep %>%
  # Select(syear, pid, abstention) %>%
  mutate(syear = syear - 1, lead_abstention = abstention) %>%
  select(syear, pid, lead_abstention)

# Merge with original soep based on pid and syear 
soep <- left_join(soep, previous_year, by = c("pid", "syear"))

val_labels(soep$lead_abstention) <- c("Yes" = 1, 
                                 "No" = 0)

# Voting in federal election (reverse abstention)
soep <- soep %>%
  mutate(votefedel_lead = case_when(lead_abstention == 1 ~ 0, 
                                    lead_abstention == 0 ~ 1)
  )

val_labels(soep$votefedel_lead) <- c("Yes" = 1, 
                                          "No" = 0)


#-------------------------------------------------------------------------------# 
##### 3.4 | Participation in Local Politics #####
#-------------------------------------------------------------------------------# 

## pli0097_h	Participate in local politics
soep$particlocalpol <- soep$pli0097_h
val_labels(soep$particlocalpol)

soep <- soep %>%
  mutate(particlocalpold = case_when(pli0097_h %in% c(1,4) ~ 1,
                                         pli0097_h %in% c(7,8) ~ 1,
                                         pli0097_h %in% c(5,6) ~ 0, # Never (or never/very rarely in 1984)
  ))

val_labels(soep$particlocalpold) <- c("Yes" = 1, 
                                      "No" = 0)

#-------------------------------------------------------------------------------# 
##### 3.5 | Satisfaction with democracy #####
#-------------------------------------------------------------------------------# 

# plh0152_v2	Satisfaction with democracy
soep$satwdemocr <- soep$plh0152_v2

#-------------------------------------------------------------------------------# 
##### 3.6 | Political orientation #####
#-------------------------------------------------------------------------------# 

## plh0004	Political orientation
soep$polor <- soep$plh0004

#-------------------------------------------------------------------------------# 
##### 3.7 | Other #####
#-------------------------------------------------------------------------------# 

## plh0007	Political interest
soep$polint <- soep$plh0007

# Reverse order so that high interest is == 4 and low interest == 1
soep$polintrev <- (soep$polint - 5)*(-1)



#-------------------------------------------------------------------------------# 
#### 4 | (Re)code Other Outcome Variables ####
#-------------------------------------------------------------------------------# 

## Worries ##
soep$worr_oecsit <- (soep$plh0033 - 4)*(-1) 
soep$worr_wpsec <- (soep$plh0042 - 4)*(-1) 


## Freetime variables ##

soep$ft_volunt <- soep$pli0096_h
soep$ft_volunt <- (soep$pli0096_h - 6)*(-1)
val_labels(soep$ft_volunt) <- c("Never" = 1, "Rarely" = 2, "Min 1x per month" = 3,
                                     "Min 1x per week" = 4, "Daily" = 5)

soep <- soep %>%
  mutate(ft_volunt_d = case_when(ft_volunt %in% c(2,3,4,5) ~ 1,
                                 ft_volunt %in% c(1) ~ 0, # Never
  ))

val_labels(soep$ft_volunt_d) <- c("Yes" = 1, 
                                       "No" = 0)


##	Participate in local politics
soep$ft_localpol <- soep$pli0097_h
soep$ft_localpol <- (soep$pli0097_h - 6)*(-1)
val_labels(soep$ft_localpol) <- c("Never" = 1, "Rarely" = 2, "Min 1x per month" = 3,
                                       "Min 1x per week" = 4, "Daily" = 5)


soep <- soep %>%
  mutate(ft_localpol_d = case_when(ft_localpol %in% c(2,3,4,5) ~ 1,
                                   ft_localpol %in% c(1) ~ 0, # Never
  ))

val_labels(soep$ft_localpol_d) <- c("Yes" = 1, 
                                         "No" = 0)

# Opera, classical concerts
soep$pli0090_h

soep$ft_opera <- (soep$pli0090_h - 6)*(-1)

val_labels(soep$ft_opera) <- c("Never" = 1, "Rarely" = 2, "Min 1x per month" = 3,
                                    "Min 1x per week" = 4, "Daily" = 5)


soep <- soep %>%
  mutate(ft_opera_d = case_when(ft_opera %in% c(2,3,4,5) ~ 1,
                                ft_opera %in% c(1) ~ 0, # Never
  ))


val_labels(soep$ft_opera_d) <- c("Yes" = 1, 
                                      "No" = 0)

# Cinema and concerts
soep$ft_cinema<- (soep$pli0091_h - 6)*(-1)

val_labels(soep$ft_cinema) <- c("Never" = 1, "Rarely" = 2, "Min 1x per month" = 3,
                                     "Min 1x per week" = 4, "Daily" = 5)

soep <- soep %>%
  mutate(ft_cinema_d = case_when(ft_cinema %in% c(2,3,4,5) ~ 1,
                                 ft_cinema %in% c(1) ~ 0, # Never
  ))

val_labels(soep$ft_cinema_d) <- c("Yes" = 1, 
                                       "No" = 0)

# Active sports
soep$ft_sports <- (soep$pli0092_h - 6)*(-1)


val_labels(soep$ft_sports) <- c("Never" = 1, "Rarely" = 2, "Min 1x per month" = 3,
                                     "Min 1x per week" = 4, "Daily" = 5)


soep <- soep %>%
  mutate(ft_sports_d = case_when(ft_sports %in% c(2,3,4,5) ~ 1,
                                 ft_sports %in% c(1) ~ 0, # Never
  ))

val_labels(soep$ft_sports_d) <- c("Yes" = 1, 
                                       "No" = 0)

# Artistic and music activities

soep$ft_artmus <- (soep$pli0093_h - 6)*(-1)


val_labels(soep$ft_artmus) <- c("Never" = 1, "Rarely" = 2, "Min 1x per month" = 3,
                                     "Min 1x per week" = 4, "Daily" = 5)


soep <- soep %>%
  mutate(ft_artmus_d = case_when(ft_artmus %in% c(2,3,4,5) ~ 1,
                                 ft_artmus %in% c(1) ~ 0, # Never
  ))

val_labels(soep$ft_artmus_d) <- c("Yes" = 1, 
                                       "No" = 0)


# Socialising with friends and family

soep$ft_social <- (soep$pli0094_h - 6)*(-1)


val_labels(soep$ft_social) <- c("Never" = 1, "Rarely" = 2, "Min 1x per month" = 3,
                                     "Min 1x per week" = 4, "Daily" = 5)

soep <- soep %>%
  mutate(ft_social_d = case_when(ft_social %in% c(2,3,4,5) ~ 1,
                                 ft_social %in% c(1) ~ 0, # Never
  ))

val_labels(soep$ft_social_d) <- c("Yes" = 1, 
                                       "No" = 0)


# Helping out with friends and family
soep$ft_help <- (soep$pli0095_h - 6)*(-1)

val_labels(soep$ft_help) <- c("Never" = 1, "Rarely" = 2, "Min 1x per month" = 3,
                                   "Min 1x per week" = 4, "Daily" = 5)

soep$ft_help

table(soep$ft_help, useNA = "always")

soep <- soep %>%
  mutate(ft_help_d = case_when(ft_help %in% c(2,3,4,5) ~ 1,
                               ft_help %in% c(1) ~ 0, # Never
  ))

table(soep$ft_help_d, useNA = "always")

val_labels(soep$ft_help_d) <- c("Yes" = 1, 
                                     "No" = 0)

# Church, religious events
soep$ft_church <- (soep$pli0098_h - 6)*(-1)


val_labels(soep$ft_church) <- c("Never" = 1, "Rarely" = 2, "Min 1x per month" = 3,
                                     "Min 1x per week" = 4, "Daily" = 5)

soep$ft_church

soep <- soep %>%
  mutate(ft_church_d = case_when(ft_church %in% c(2,3,4,5) ~ 1,
                                 ft_church %in% c(1) ~ 0, # Never
  ))

val_labels(soep$ft_church_d) <- c("Yes" = 1, 
                                       "No" = 0)

### Create Composite Index (mean) ###

soep$ft_index <- rowMeans(soep[, c("ft_volunt", "ft_localpol", "ft_opera", "ft_cinema", "ft_sports",
                                             "ft_artmus", "ft_social", "ft_help")], na.rm = TRUE)

soep$ft_index_cult <- rowMeans(soep[, c("ft_opera", "ft_cinema", "ft_artmus")], na.rm = TRUE)



#-------------------------------------------------------------------------------# 
#### 5 | (Re)code moving variables ####
#-------------------------------------------------------------------------------# 

#-------------------------------------------------------------------------------# 
##### 5.1 | Residential move #####
#-------------------------------------------------------------------------------# 
# resmove variable
soep$resmove
# Rename
soep$moved <- soep$resmove

table(soep$moved, useNA = "always")
summary(soep$moved)

# Label
val_labels(soep$moved) <- c("Moved" = 1, "Stayed" = 0) # need to be haven_labelled

#-------------------------------------------------------------------------------# 
##### 5.2 | Postcode-based move #####
#-------------------------------------------------------------------------------# 

# chg_zip
soep$chg_zip
table(soep$chg_zip, useNA = "always")

# --> create moved dummy based on plz change and compare 
soep = soep[order(soep$pid, soep$syear), ]

# create a lagged plz and recode all == 1 if plz != plz_lagged & plz == !is.na
soep <-  soep %>%
  group_by(pid) %>%
  dplyr::mutate(plz_lag1 = dplyr::lag(plz, n=1, default = NA)) %>%
  ungroup()

soep <- soep %>%
  dplyr::mutate(moved2 = case_when(plz == plz_lag1 ~ 0,
                            plz != plz_lag1 ~ 1))
table(soep$moved2, useNA ="always")

## Check for within PLZ-moves by comparing moved with moved2 ##
table(soep$moved, soep$moved2, exclude = NULL)

## Check validity of moved2 by comparing moved with chg_zip ##
table(soep$chg_zip, useNA ="always")
table(soep$moved2, useNA ="always")
table(soep$chg_zip, soep$moved2, exclude = NULL)

# --> there are quite some NAs in movedand chg_zip compared to moved2 --> used moved2 for analysis

# Postcode based moving (moved2) #
soep$moved_plz <- soep$moved2
val_labels(soep$moved_plz) <- c("Moved" = 1, "Stayed" = 0)

#-------------------------------------------------------------------------------# 
##### 5.3 | Kreis-based move #####
#-------------------------------------------------------------------------------# 

# create moved dummy based on krs change and compare 
soep$krs <- soep$kkz_rek

soep = soep[order(soep$pid, soep$syear), ]

# create a lagged plz and recode all == 1 if plz != plz_lagged & plz == !is.na
soep <-  soep %>%
  group_by(pid) %>%
  dplyr::mutate(krs_lag1 = dplyr::lag(krs, n=1, default = NA)) %>%
  ungroup()

soep <- soep %>%
  dplyr::mutate(moved_krs = case_when(krs == krs_lag1 ~ 0,
                                   krs != krs_lag1 ~ 1))


val_labels(soep$moved_krs) <- c("Moved" = 1, "Stayed" = 0)


# Create balanced moved2 variable
# Set as panel data
soep <- data.table(soep)

# Sort the data by id and time
setkey(soep, pid, syear)

# Create a lagged variable using the shift() function from data.table
soep[, krs_lag1_bal := shift(krs), by = pid]

# Set the lagged value to NA if the previous year is missing
soep[syear > 1 & syear - 1 != shift(syear), krs_lag1_bal := NA]

# Create balanced krs-move variable
soep <- soep %>%
  dplyr::mutate(moved_krs_bal = case_when(krs == krs_lag1_bal ~ 0,
                                      krs != krs_lag1_bal ~ 1))

table(soep$krs_lag1_bal, useNA ="always")

#-------------------------------------------------------------------------------# 
##### 5.4 | Bundesland-level move #####
#-------------------------------------------------------------------------------# 

# Create balanced bula move variable
# Set as panel data
soep <- data.table(soep)

# Sort the data by id and time
setkey(soep, pid, syear)

# Create a lagged variable using the shift() function from data.table
soep[, bula_lag1_bal := shift(bula), by = pid]

# Set the lagged value to NA if the previous year is missing
soep[syear > 1 & syear - 1 != shift(syear), bula_lag1_bal := NA]

# Create balanced krs-move variable
soep <- soep %>%
  dplyr::mutate(moved_bula_bal = case_when(bula == bula_lag1_bal ~ 0,
                                           bula != bula_lag1_bal ~ 1))



#-------------------------------------------------------------------------------# 
##### 5.5 | Moving distance #####
#-------------------------------------------------------------------------------# 

# Distance variable (in km)
soep$distance_km <- soep$distance/1000

soep <- soep %>% 
  mutate(distance_km_krs = case_when(moved_krs_bal == 1 ~ distance_km,
                                     moved_krs_bal == 0 | is.na(moved_krs_bal) == TRUE ~ 0))

soep <- soep %>% 
  mutate(distance_km_krs_rs = case_when(moved_krs_bal == 1 ~ distance_km/100,
                                        moved_krs_bal == 0 | is.na(moved_krs_bal) == TRUE ~ 0))

# Create categorical variable
# Until 50, 50 - 200, 200-400, > 400
soep <- soep %>% 
  mutate(distance_cat_krs = case_when(moved_krs_bal == 1 & distance_km <= 50  ~ 1,
                                      moved_krs_bal == 1 & distance_km > 50 & distance_km <= 200 ~ 2,
                                      moved_krs_bal == 1 & distance_km > 200 & distance_km <= 400 ~ 3, 
                                      moved_krs_bal == 1 & distance_km > 400 ~ 4,
                                      moved_krs_bal == 0 | is.na(moved_krs_bal) == TRUE ~ 1)) # Re-think whether this makes sense!

# With 3 categories
soep <- soep %>% 
  mutate(distance_cat3_krs = case_when(moved_krs_bal == 1 & distance_km <= 50  ~ 1,
                                       moved_krs_bal == 1 & distance_km > 50 & distance_km <= 200 ~ 2,
                                       moved_krs_bal == 1 & distance_km > 200  ~ 3, 
                                       moved_krs_bal == 0 | is.na(moved_krs_bal) == TRUE ~ 1)) # Re-think whether this makes sense!


#-------------------------------------------------------------------------------# 

#### Assign Variable Labels ####

var_label(soep)
var_label(soep) <- list(sex = "Gender",
  region = "Region of residence",
  region_lag1 = "Previous region of residence",
  west = "Region of residence (east/west)",
  west_lag1 = "Previous region of residence (east/west)",
  income = "Disposable HH-income",
  income_clean = "Disposable HH-income", # excludes those in education
  income_grp = "Income group (based on disp. HH-income)",
  income_grp_clean = "Income group (based on disp. HH-income)", # excludes those in education
  age = "Age",
  age_grp = "Age groups",
  age_grp3 = "Age groups",
  marstatag = "Marital status",
  hhtype = "Household type",
  childd = "Children",
  educ = "Education level",
  colleged = "College degree",
  migbackd = "Migration background",
  egpag_ext = "Aggregated EGP-Class (extended)",
  egpag_ext_lag1 = "Previous aggregated EGP-Class (extended)",
  pgemplst = "Employment status",
  pgemplst_lag1 = "Previous employment status",
  emplst_d_lag1 = "Employment status (aggregated)",
  emplst_d_lag1 = "Previous employment status (aggregated)",
  
  polor = "Political orientation (left-right)",
  partylean = "Party leaning (categories)",
  partyleand = "Party leaning (yes/no)",
  partyleand2 = "Party leaning (yes/no)", # including don't knows as 0
  partyleanint = "Party leaning intensity",
  abstention = "Abstention in federal elections (yes/no)",
  lead_abstention = "Abstention in federal elections (yes/no)",
  particlocalpol = "Participate in local politics (yes/no)",
  satwdemocr = "Satisfaction with democracy",
  moved = "Moving status (answered)",
  moved_plz = "Moving status (plz)",
  moved_krs = "Moving status (krs)",
  moved_krs_bal = "Moving status (krs), balanced",
  distance_km = "Distance of move (km)"
)

rm(list = setdiff(ls(), c("soep")))


#-------------------------------------------------------------------------------# 
#### B | Merge OI and code additional variables ####
#-------------------------------------------------------------------------------# 
orig_path <- "data_original/"
data_path <- "data_created/"
#-------------------------------------------------------------------------------# 
#### 6 | Load and merge data ####
#-------------------------------------------------------------------------------# 

# Opportunity Index with a knowledge economy focus
oi_v4 <- read_csv(paste0(data_path, 'opportunity_KE_by_krs.csv'))

# Remove first column
oi_v4 <- oi_v4[,-c(1)]

# Merge soep with OI
soepoi_v4 <- full_join(soep, oi_v4, by = "krs")

# Check coding
soepoi_v4[100:300, c("syear", "pid", "krs", "kkz_rek", "Kommune")]

soepoi_v4 <- soepoi_v4 %>%
  subset(is.na(pid) == FALSE)


#-------------------------------------------------------------------------------# 
#### 7 | Calculate moving variables for descriptives and models ####
#-------------------------------------------------------------------------------# 

### Main independent variables for regression model #

#-------------------------------------------------------------------------------# 
##### 7.1 | Index values #####
#-------------------------------------------------------------------------------# 

## Full index ##

# Rescale
soepoi_v4$opportunity_KE_pc01_wn10 <- soepoi_v4$opportunity_KE_pc01_wn*10


## Lag variable for descriptives (unbalanced)

soepoi_v4 = soepoi_v4[order(soepoi_v4$pid, soepoi_v4$syear), ]

soepoi_v4 <- soepoi_v4 %>%
  group_by(pid) %>%
  dplyr::mutate(oiw_lag1_ub = dplyr::lag(opportunity_KE_pc01_wn, n=1, default = NA)) %>%
  as.data.frame() %>%
  ungroup()

soepoi_v4 <-  soepoi_v4 %>%
  group_by(pid) %>%
  dplyr::mutate(oiw_lag2_ub = dplyr::lag(opportunity_KE_pc01_wn, n=2, default = NA)) %>%
  ungroup()


soepoi_v4 <-  soepoi_v4 %>%
  group_by(pid) %>%
  dplyr::mutate(oiw_lag3_ub = dplyr::lag(opportunity_KE_pc01_wn, n=3, default = NA)) %>%
  ungroup()


## Lag variable for models (balanced)

# Set as panel data
soepoi_v4 <- data.table(soepoi_v4)

# Sort the data by id and time
setkey(soepoi_v4, pid, syear)

# Create a lagged variable using the shift() function from data.table
soepoi_v4[, oiw_lag1 := shift(opportunity_KE_pc01_wn), by = pid]

# Set the lagged value to NA if the previous year is missing
soepoi_v4[syear > 1 & syear - 1 != shift(syear), oiw_lag1 := NA]

# Repeat for lag2 and lag3
soepoi_v4[, oiw_lag2 := shift(opportunity_KE_pc01_wn, n = 2L), by = pid]

soepoi_v4[syear > 1 & syear - 2 != shift(syear, n = 2L), oiw_lag2 := NA]

soepoi_v4[, oiw_lag3 := shift(opportunity_KE_pc01_wn, n = 3L), by = pid]

soepoi_v4[syear > 1 & syear - 3 != shift(syear, n = 3L), oiw_lag3 := NA]


#------------------------------------------#
## Lead variable for models (balanced)

# Set as panel data
soepoi_v4 <- data.table(soepoi_v4)

# Sort the data by id and time
setkey(soepoi_v4, pid, syear)

# Create a lead variable using the shift() function from data.table
soepoi_v4[, oiw_lead1 := shift(opportunity_KE_pc01_wn, n = 1L, type = "lead"), by = pid]

# Set the lagged value to NA if the previous year is missing
soepoi_v4[syear > 1 & syear + 1 != shift(syear, n = 1L, type = "lead"), oiw_lead1 := NA]

# Repeat for lag2 and lag3
soepoi_v4[, oiw_lead2 := shift(opportunity_KE_pc01_wn, n = 2L, type = "lead"), by = pid]
table(soepoi_v4$oiw_lead2, useNA = "always")

soepoi_v4[syear > 1 & syear + 2 != shift(syear, n = 2L, type = "lead"), oiw_lead2 := NA]
table(soepoi_v4$oiw_lead2, useNA = "always")

soepoi_v4[, oiw_lead3 := shift(opportunity_KE_pc01_wn, n = 3L, type = "lead"), by = pid]
table(soepoi_v4$oiw_lead3, useNA = "always")

soepoi_v4[syear > 1 & syear + 3 != shift(syear, n = 3L, type = "lead"), oiw_lead3 := NA]
table(soepoi_v4$oiw_lead3, useNA = "always")

## Components ##

# Work: pc1_KE_01_wn #
table(soepoi_v4$pc1_KE_01_wn)

# Rescale
soepoi_v4$pc1_KE_01_wn10 <- soepoi_v4$pc1_KE_01_wn*10

# amen: pc2_KE_01_wn #
table(soepoi_v4$pc2_KE_01_wn)

# Rescale
soepoi_v4$pc2_KE_01_wn10 <- soepoi_v4$pc2_KE_01_wn*10

#-------------------------------------------------------------------------------# 
##### 7.2 | Moving direction (opmove) #####
#-------------------------------------------------------------------------------# 

#-------------------------------------------------------------------------------# 
##### 7.2.1 | Opmove unbalanced #####
#-------------------------------------------------------------------------------# 

## Categorical variable for moving direction 

# Calculate index change (oichange)

# Create oichange
soepoi_v4$oichange_ub <- (soepoi_v4$opportunity_KE_pc01_wn - soepoi_v4$oiw_lag1_ub)

# Create opmove
soepoi_v4$opmove_ub <- NA
soepoi_v4$opmove_ub[soepoi_v4$oichange_ub > 0] <- 1 # up
soepoi_v4$opmove_ub[soepoi_v4$oichange_ub == 0] <- 2 # stable
soepoi_v4$opmove_ub[soepoi_v4$oichange_ub < 0] <- 3 # down


# Assign value labels
val_labels(soepoi_v4$opmove_ub) <- c("Up" = 1, "Stable" = 2, "Down" = 3)



#-----------------------------------------------#
# By components #

# Create work_ichange
soepoi_v4 = soepoi_v4[order(soepoi_v4$pid, soepoi_v4$syear), ]

soepoi_v4 <- soepoi_v4 %>%
  group_by(pid) %>%
  dplyr::mutate(work_iw_lag1_ub = dplyr::lag(pc1_KE_01_wn, n=1, default = NA)) %>%
  as.data.frame() %>%
  ungroup()

soepoi_v4$work_ichange_ub <- (soepoi_v4$pc1_KE_01_wn - soepoi_v4$work_iw_lag1_ub)


## Create variable for work opportunity moves ##

## work_opmove ##
soepoi_v4$work_opmove_ub <- NA
soepoi_v4$work_opmove_ub[soepoi_v4$work_ichange_ub > 0] <- 1 # up
soepoi_v4$work_opmove_ub[soepoi_v4$work_ichange_ub == 0] <- 2 # stable
soepoi_v4$work_opmove_ub[soepoi_v4$work_ichange_ub < 0] <- 3 # down



val_labels(soepoi_v4$work_opmove_ub) # assign value labels
val_labels(soepoi_v4$work_opmove_ub) <- c("Up" = 1, "Stable" = 2, "Down" = 3)



#-----------------------------------------------#
# Create amen_ichange
# Create work_ichange
soepoi_v4 = soepoi_v4[order(soepoi_v4$pid, soepoi_v4$syear), ]

soepoi_v4 <- soepoi_v4 %>%
  group_by(pid) %>%
  dplyr::mutate(amen_iw_lag1_ub = dplyr::lag(pc2_KE_01_wn, n=1, default = NA)) %>%
  as.data.frame() %>%
  ungroup()

soepoi_v4$amen_ichange_ub <- (soepoi_v4$pc2_KE_01_wn - soepoi_v4$amen_iw_lag1_ub)


## Create variable for amen opportunity moves ##

## amen_opmove ##
soepoi_v4$amen_opmove_ub <- NA
soepoi_v4$amen_opmove_ub[soepoi_v4$amen_ichange_ub > 0] <- 1 # up
soepoi_v4$amen_opmove_ub[soepoi_v4$amen_ichange_ub == 0] <- 2 # stable
soepoi_v4$amen_opmove_ub[soepoi_v4$amen_ichange_ub < 0] <- 3 # down

val_labels(soepoi_v4$amen_opmove_ub) # assign value labels
val_labels(soepoi_v4$amen_opmove_ub) <- c("Up" = 1, "Stable" = 2, "Down" = 3)



#Upmove, downmove dummies (unbalanced) whole index, PC1, PC2
soepoi_v4 <- soepoi_v4 %>%
  dplyr::mutate(upmove_ub = case_when(opmove_ub == 1 ~ 1,
                                      opmove_ub == 2 | opmove_ub == 3 ~ 0),
                downmove_ub = case_when(opmove_ub == 3 ~ 1,
                                        opmove_ub == 1 | opmove_ub == 2 ~ 0),
                
                
                PC1_upmove_ub = case_when(work_opmove_ub == 1 ~ 1,
                                          work_opmove_ub == 2 | work_opmove_ub == 3 ~ 0),
                PC1_downmove_ub = case_when(work_opmove_ub == 3 ~ 1,
                                            work_opmove_ub == 1 | work_opmove_ub == 2 ~ 0),
                
                PC2_upmove_ub = case_when(amen_opmove_ub == 1 ~ 1,
                                          amen_opmove_ub == 2 | amen_opmove_ub == 3 ~ 0),
                PC2_downmove_ub = case_when(amen_opmove_ub == 3 ~ 1,
                                            amen_opmove_ub == 1 | amen_opmove_ub == 2 ~ 0)
                
                
  )

#-------------------------------------------------------------------------------# 
##### 7.2.2 | Opmove balanced #####
#-------------------------------------------------------------------------------# 

## Categorical variable for moving direction 

# Calculate index change (oichange)

# Create oichange
soepoi_v4$oichange <- (soepoi_v4$opportunity_KE_pc01_wn - soepoi_v4$oiw_lag1)


# Create opmove
soepoi_v4$opmove <- NA
soepoi_v4$opmove[soepoi_v4$oichange > 0] <- 1 # up
soepoi_v4$opmove[soepoi_v4$oichange == 0] <- 2 # stable
soepoi_v4$opmove[soepoi_v4$oichange < 0] <- 3 # down


# Assign value labels
val_labels(soepoi_v4$opmove) <- c("Up" = 1, "Stable" = 2, "Down" = 3)

#-----------------------------------------------#
# By components #

# Create work_ichange

# Set as panel data and sort
soepoi_v4 <- data.table(soepoi_v4)
setkey(soepoi_v4, pid, syear)

# Create lagged value
soepoi_v4[, work_iw_lag1 := shift(pc1_KE_01_wn), by = pid]

# Set the lagged value to NA if the previous year is missing
soepoi_v4[syear > 1 & syear - 1 != shift(syear), work_iw_lag1 := NA]

soepoi_v4$work_ichange <- (soepoi_v4$pc1_KE_01_wn - soepoi_v4$work_iw_lag1)


## Create variable for work opportunity moves ##

## work_opmove ##
soepoi_v4$work_opmove <- NA
soepoi_v4$work_opmove[soepoi_v4$work_ichange > 0] <- 1 # up
soepoi_v4$work_opmove[soepoi_v4$work_ichange == 0] <- 2 # stable
soepoi_v4$work_opmove[soepoi_v4$work_ichange < 0] <- 3 # down

val_labels(soepoi_v4$work_opmove) # assign value labels
val_labels(soepoi_v4$work_opmove) <- c("Up" = 1, "Stable" = 2, "Down" = 3)


#-----------------------------------------------#
# Create amen_ichange

# Set as panel data and sort
soepoi_v4 <- data.table(soepoi_v4)
setkey(soepoi_v4, pid, syear)

# Create lagged value
soepoi_v4[, amen_iw_lag1 := shift(pc2_KE_01_wn), by = pid]

# Set the lagged value to NA if the previous year is missing
soepoi_v4[syear > 1 & syear - 1 != shift(syear), amen_iw_lag1 := NA]

soepoi_v4$amen_ichange <- (soepoi_v4$pc2_KE_01_wn - soepoi_v4$amen_iw_lag1)


## Create variable for amen opportunity moves ##

## amen_opmove ##
soepoi_v4$amen_opmove <- NA
soepoi_v4$amen_opmove[soepoi_v4$amen_ichange > 0] <- 1 # up
soepoi_v4$amen_opmove[soepoi_v4$amen_ichange == 0] <- 2 # stable
soepoi_v4$amen_opmove[soepoi_v4$amen_ichange < 0] <- 3 # down

val_labels(soepoi_v4$amen_opmove) # assign value labels
val_labels(soepoi_v4$amen_opmove) <- c("Up" = 1, "Stable" = 2, "Down" = 3)


# Up- and Downmove dummies

soepoi_v4 <- soepoi_v4 %>%
  dplyr::mutate(upmove = case_when(opmove == 1 ~ 1,
                                   opmove == 2 | opmove == 3 ~ 0))

soepoi_v4 <- soepoi_v4 %>%
  dplyr::mutate(downmove = case_when(opmove == 3 ~ 1,
                                     opmove == 1 | opmove == 2 ~ 0))


#-------------------------------------------------------------------------------# 
##### 7.3 | Opportunity Quintiles #####
#-------------------------------------------------------------------------------# 

## Create variable with quintiles of opportunity value ##

## Full Index ##

# Indexchange unweighted

soepoi_v4 <- soepoi_v4 %>%
  group_by(syear) %>%
  mutate(oiqtl = ntile(opportunity_KE_pc01_wn, 5)) %>%
  ungroup()

# Lag
# Set as panel data and sort
soepoi_v4 <- data.table(soepoi_v4)
setkey(soepoi_v4, pid, syear)

# Create lagged value and set the lagged value to NA if the previous year is missing
soepoi_v4[, oiqtl_lag1 := shift(oiqtl), by = pid]
soepoi_v4[syear > 1 & syear - 1 != shift(syear), oiqtl_lag1 := NA]


# weighted

# Set survey design
ds_svy_design_phrf1 <- survey::svydesign(data = soepoi_v4, ids = ~pid, weights = ~phrf1)

svyquantiles_oi <- svyby(formula = ~opportunity_KE_pc01_wn,
                         by = ~syear,
                         design = ds_svy_design_phrf1,
                         FUN = svyquantile,
                         quantiles =c(0.2, 0.4, 0.6, 0.8, 1))

svyquantiles_oi <- svyquantiles_oi %>%
  select(-c(se.opportunity_KE_pc01_wn.0.2,se.opportunity_KE_pc01_wn.0.4, se.opportunity_KE_pc01_wn.0.6,
            se.opportunity_KE_pc01_wn.0.8, se.opportunity_KE_pc01_wn.1))


soepoi_v4 <-  left_join(soepoi_v4, svyquantiles_oi, by = "syear")

# Recode as Q1-Q5
soepoi_v4$oiqtl_wn <-  NA
soepoi_v4$oiqtl_wn[soepoi_v4$opportunity_KE_pc01_wn <= soepoi_v4$opportunity_KE_pc01_wn.0.2] <- 1

soepoi_v4$oiqtl_wn[(soepoi_v4$opportunity_KE_pc01_wn > soepoi_v4$opportunity_KE_pc01_wn.0.2) & 
                     (soepoi_v4$opportunity_KE_pc01_wn <= soepoi_v4$opportunity_KE_pc01_wn.0.4)] <- 2

soepoi_v4$oiqtl_wn[(soepoi_v4$opportunity_KE_pc01_wn > soepoi_v4$opportunity_KE_pc01_wn.0.4) & 
                     (soepoi_v4$opportunity_KE_pc01_wn <= soepoi_v4$opportunity_KE_pc01_wn.0.6)] <- 3

soepoi_v4$oiqtl_wn[(soepoi_v4$opportunity_KE_pc01_wn > soepoi_v4$opportunity_KE_pc01_wn.0.6) & 
                     (soepoi_v4$opportunity_KE_pc01_wn <= soepoi_v4$opportunity_KE_pc01_wn.0.8)] <- 4

soepoi_v4$oiqtl_wn[(soepoi_v4$opportunity_KE_pc01_wn > soepoi_v4$opportunity_KE_pc01_wn.0.8) & 
                     (soepoi_v4$opportunity_KE_pc01_wn <= soepoi_v4$opportunity_KE_pc01_wn.1)] <- 5


# Lagged variable
# Set as panel data and sort
soepoi_v4 <- data.table(soepoi_v4)
setkey(soepoi_v4, pid, syear)

# Create lagged value and set the lagged value to NA if the previous year is missing
soepoi_v4[, oiqtl_wn_lag1 := shift(oiqtl_wn), by = pid]
soepoi_v4[syear > 1 & syear - 1 != shift(syear), oiqtl_wn_lag1 := NA]

# need to recode oiqtl for descriptive analysis
val_labels(soepoi_v4$oiqtl_wn_lag1) <- c("Q1" = 1, "Q2" = 2, "Q3" = 3, "Q4" = 4, "Q5" = 5)



## Deciles ##
soepoi_v4 = soepoi_v4 %>%
  select(-c(opportunity_KE_pc01_wn.0.2, opportunity_KE_pc01_wn.0.4, opportunity_KE_pc01_wn.0.6, 
            opportunity_KE_pc01_wn.0.8, opportunity_KE_pc01_wn.1))

# Indexchange unweighted
soepoi_v4 <- soepoi_v4 %>%
  group_by(syear) %>%
  mutate(oidecile = ntile(opportunity_KE_pc01_wn, 10)) %>%
  ungroup()

# Lag
# Set as panel data and sort
soepoi_v4 <- data.table(soepoi_v4)
setkey(soepoi_v4, pid, syear)

# Create lagged value and set the lagged value to NA if the previous year is missing
soepoi_v4[, oidecile_lag1 := shift(oidecile), by = pid]
soepoi_v4[syear > 1 & syear - 1 != shift(syear), oidecile_lag1 := NA]


# weighted

# Set survey design
ds_svy_design_phrf1 <- survey::svydesign(data = soepoi_v4, ids = ~pid, weights = ~phrf1)

svydeciles_oi <- svyby(formula = ~opportunity_KE_pc01_wn,
                       by = ~syear,
                       design = ds_svy_design_phrf1,
                       FUN = svyquantile,
                       quantiles =c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1))

svydeciles_oi <- svydeciles_oi %>%
  select(-c(se.opportunity_KE_pc01_wn.0.1, se.opportunity_KE_pc01_wn.0.2, se.opportunity_KE_pc01_wn.0.3,
            se.opportunity_KE_pc01_wn.0.4, se.opportunity_KE_pc01_wn.0.5, se.opportunity_KE_pc01_wn.0.6,
            se.opportunity_KE_pc01_wn.0.7, se.opportunity_KE_pc01_wn.0.8, se.opportunity_KE_pc01_wn.0.9, se.opportunity_KE_pc01_wn.1))

svydeciles_oi
# Note some earlier years have NA for qtl 1 -- 2011 onwards not affected though

soepoi_v4 <-  left_join(soepoi_v4, svydeciles_oi, by = "syear")


# Recode as Q1-Q5
soepoi_v4$oidecile_wn <-  NA
soepoi_v4$oidecile_wn[soepoi_v4$opportunity_KE_pc01_wn <= soepoi_v4$opportunity_KE_pc01_wn.0.1] <- 1

soepoi_v4$oidecile_wn[(soepoi_v4$opportunity_KE_pc01_wn > soepoi_v4$opportunity_KE_pc01_wn.0.1) & 
                        (soepoi_v4$opportunity_KE_pc01_wn <= soepoi_v4$opportunity_KE_pc01_wn.0.2)] <- 2

soepoi_v4$oidecile_wn[(soepoi_v4$opportunity_KE_pc01_wn > soepoi_v4$opportunity_KE_pc01_wn.0.2) & 
                        (soepoi_v4$opportunity_KE_pc01_wn <= soepoi_v4$opportunity_KE_pc01_wn.0.3)] <- 3

soepoi_v4$oidecile_wn[(soepoi_v4$opportunity_KE_pc01_wn > soepoi_v4$opportunity_KE_pc01_wn.0.3) & 
                        (soepoi_v4$opportunity_KE_pc01_wn <= soepoi_v4$opportunity_KE_pc01_wn.0.4)] <- 4

soepoi_v4$oidecile_wn[(soepoi_v4$opportunity_KE_pc01_wn > soepoi_v4$opportunity_KE_pc01_wn.0.4) & 
                        (soepoi_v4$opportunity_KE_pc01_wn <= soepoi_v4$opportunity_KE_pc01_wn.0.5)] <- 5

soepoi_v4$oidecile_wn[(soepoi_v4$opportunity_KE_pc01_wn > soepoi_v4$opportunity_KE_pc01_wn.0.5) & 
                        (soepoi_v4$opportunity_KE_pc01_wn <= soepoi_v4$opportunity_KE_pc01_wn.0.6)] <- 6

soepoi_v4$oidecile_wn[(soepoi_v4$opportunity_KE_pc01_wn > soepoi_v4$opportunity_KE_pc01_wn.0.6) & 
                        (soepoi_v4$opportunity_KE_pc01_wn <= soepoi_v4$opportunity_KE_pc01_wn.0.7)] <- 7

soepoi_v4$oidecile_wn[(soepoi_v4$opportunity_KE_pc01_wn > soepoi_v4$opportunity_KE_pc01_wn.0.7) & 
                        (soepoi_v4$opportunity_KE_pc01_wn <= soepoi_v4$opportunity_KE_pc01_wn.0.8)] <- 8

soepoi_v4$oidecile_wn[(soepoi_v4$opportunity_KE_pc01_wn > soepoi_v4$opportunity_KE_pc01_wn.0.8) & 
                        (soepoi_v4$opportunity_KE_pc01_wn <= soepoi_v4$opportunity_KE_pc01_wn.0.9)] <- 9

soepoi_v4$oidecile_wn[(soepoi_v4$opportunity_KE_pc01_wn > soepoi_v4$opportunity_KE_pc01_wn.0.9) & 
                        (soepoi_v4$opportunity_KE_pc01_wn <= soepoi_v4$opportunity_KE_pc01_wn.1)] <- 10


# Lagged variable
# Set as panel data and sort
soepoi_v4 <- data.table(soepoi_v4)
setkey(soepoi_v4, pid, syear)

# Create lagged value and set the lagged value to NA if the previous year is missing
soepoi_v4[, oidecile_wn_lag1 := shift(oidecile_wn), by = pid]
soepoi_v4[syear > 1 & syear - 1 != shift(syear), oidecile_wn_lag1 := NA]

# need to recode oiqtl for descriptive analysis
val_labels(soepoi_v4$oidecile_wn_lag1) <- c("Q1" = 1, "Q2" = 2, "Q3" = 3, "Q4" = 4, "Q5" = 5, "Q6" = 6, "Q7" = 7, "Q8" = 8, "Q9" = 9, "Q10" = 10)


# Recode decile variable so that reference group are bottom 3 / top 3 deciles
soepoi_v4$oidecile_wn_aggr_up <- soepoi_v4$oidecile_wn
soepoi_v4$oidecile_wn_aggr_up[soepoi_v4$oidecile_wn %in% c(1,2,3,4,5)] <- 1 # Replace bottom deciles with 1
soepoi_v4$oidecile_wn_aggr_down <- soepoi_v4$oidecile_wn

# Replace top deciles with 10
soepoi_v4$oidecile_wn_aggr_down[soepoi_v4$oidecile_wn %in% c(6,7,8,9,10)] <- 1 # Replace top deciles with 1
soepoi_v4$oidecile_wn_aggr_down[soepoi_v4$oidecile_wn %in% c(1)] <- 6 # Workaround: 1 = 6



#-----------------------------------------------#
## Components ##
#-----------------------------------------------#

#-----------------------------------------------#
# Work #
#-----------------------------------------------#

# Set survey design to account for distribution in population
ds_svy_design_phrf1 <- survey::svydesign(data = soepoi_v4, ids = ~pid, weights = ~phrf1)

svyquantiles_worki <- svyby(formula = ~pc1_KE_01_wn,
                            by = ~syear,
                            design = ds_svy_design_phrf1,
                            FUN = svyquantile,
                            quantiles =c(0.2, 0.4, 0.6, 0.8, 1))

svyquantiles_worki <- svyquantiles_worki %>%
  select(-c(se.pc1_KE_01_wn.0.2,se.pc1_KE_01_wn.0.4, se.pc1_KE_01_wn.0.6,
            se.pc1_KE_01_wn.0.8, se.pc1_KE_01_wn.1))


soepoi_v4 <-  left_join(soepoi_v4, svyquantiles_worki, by = "syear")

# Recode as Q1-Q5
soepoi_v4$workiqtl_wn <-  NA
soepoi_v4$workiqtl_wn[soepoi_v4$pc1_KE_01_wn <= soepoi_v4$pc1_KE_01_wn.0.2] <- 1

soepoi_v4$workiqtl_wn[(soepoi_v4$pc1_KE_01_wn > soepoi_v4$pc1_KE_01_wn.0.2) & 
                        (soepoi_v4$pc1_KE_01_wn <= soepoi_v4$pc1_KE_01_wn.0.4)] <- 2

soepoi_v4$workiqtl_wn[(soepoi_v4$pc1_KE_01_wn > soepoi_v4$pc1_KE_01_wn.0.4) & 
                        (soepoi_v4$pc1_KE_01_wn <= soepoi_v4$pc1_KE_01_wn.0.6)] <- 3

soepoi_v4$workiqtl_wn[(soepoi_v4$pc1_KE_01_wn > soepoi_v4$pc1_KE_01_wn.0.6) & 
                        (soepoi_v4$pc1_KE_01_wn <= soepoi_v4$pc1_KE_01_wn.0.8)] <- 4

soepoi_v4$workiqtl_wn[(soepoi_v4$pc1_KE_01_wn > soepoi_v4$pc1_KE_01_wn.0.8) & 
                        (soepoi_v4$pc1_KE_01_wn <= soepoi_v4$pc1_KE_01_wn.1)] <- 5


# Code lag workiqtl_wn 
# Set as panel data and sort
soepoi_v4 <- data.table(soepoi_v4)
setkey(soepoi_v4, pid, syear)

# Create lagged value and set the lagged value to NA if the previous year is missing
soepoi_v4[, workiqtl_wn_lag1 := shift(workiqtl_wn), by = pid]
soepoi_v4[syear > 1 & syear - 1 != shift(syear), workiqtl_wn_lag1 := NA]


# need to recode workiqtl_wn for descriptive analysis
val_labels(soepoi_v4$workiqtl_wn_lag1) <- c("Q1" = 1, "Q2" = 2, "Q3" = 3, "Q4" = 4, "Q5" = 5)


#-----------------------------------------------#
# Amenities #
#-----------------------------------------------#

# Set survey design to account for distribution in population
svyquantiles_ameni <- svyby(formula = ~pc2_KE_01_wn,
                            by = ~syear,
                            design = ds_svy_design_phrf1,
                            FUN = svyquantile,
                            quantiles =c(0.2, 0.4, 0.6, 0.8, 1))

svyquantiles_ameni <- svyquantiles_ameni %>%
  select(-c(se.pc2_KE_01_wn.0.2,se.pc2_KE_01_wn.0.4, se.pc2_KE_01_wn.0.6,
            se.pc2_KE_01_wn.0.8, se.pc2_KE_01_wn.1))

soepoi_v4 <-  left_join(soepoi_v4, svyquantiles_ameni, by = "syear")

# Recode as Q1-Q5
soepoi_v4$ameniqtl_wn <-  NA
soepoi_v4$ameniqtl_wn[soepoi_v4$pc2_KE_01_wn <= soepoi_v4$pc2_KE_01_wn.0.2] <- 1

soepoi_v4$ameniqtl_wn[(soepoi_v4$pc2_KE_01_wn > soepoi_v4$pc2_KE_01_wn.0.2) & 
                        (soepoi_v4$pc2_KE_01_wn <= soepoi_v4$pc2_KE_01_wn.0.4)] <- 2

soepoi_v4$ameniqtl_wn[(soepoi_v4$pc2_KE_01_wn > soepoi_v4$pc2_KE_01_wn.0.4) & 
                        (soepoi_v4$pc2_KE_01_wn <= soepoi_v4$pc2_KE_01_wn.0.6)] <- 3

soepoi_v4$ameniqtl_wn[(soepoi_v4$pc2_KE_01_wn > soepoi_v4$pc2_KE_01_wn.0.6) & 
                        (soepoi_v4$pc2_KE_01_wn <= soepoi_v4$pc2_KE_01_wn.0.8)] <- 4

soepoi_v4$ameniqtl_wn[(soepoi_v4$pc2_KE_01_wn > soepoi_v4$pc2_KE_01_wn.0.8) & 
                        (soepoi_v4$pc2_KE_01_wn <= soepoi_v4$pc2_KE_01_wn.1)] <- 5


# Code a lag ameniqtl_wn 
soepoi_v4 = soepoi_v4[order(soepoi_v4$pid, soepoi_v4$syear), ]

soepoi_v4 <-  soepoi_v4 %>%
  group_by(pid) %>%
  dplyr::mutate(ameniqtl_wn_lag1 = dplyr::lag(ameniqtl_wn, n=1, default = NA)) %>%
  ungroup()

# need to recode ameniqtl_wn for descriptive analysis
val_labels(soepoi_v4$ameniqtl_wn_lag1) <- c("Q1" = 1, "Q2" = 2, "Q3" = 3, "Q4" = 4, "Q5" = 5)


#-------------------------------------------------------------------------------# 
#### Assign Variable Labels ####
#-------------------------------------------------------------------------------# 

# soepoi_v4 # 
var_label(soepoi_v4)

var_label(soepoi_v4) <- list(
  opportunity_KE_pc01_wn = "Opportunity index",
  opportunity_KE_pc01_wn10 = "Opportunity index (10p)",
  
  pc1_KE_01_wn = "Opportunity index (work)",
  pc1_KE_01_wn10 = "Opportunity index (work, 10p)",
  
  pc2_KE_01_wn = "Opportunity index (wealth)",
  pc2_KE_01_wn10 = "Opportunity index (wealth, 10p)",
  
  opmove = "Opportunity move",
  work_opmove = "Work opportunity move",
  amen_opmove = "Amenities opportunity move",
  
  opmove_ub = "Opportunity move (unbalanced)",
  work_opmove_ub = "Work opportunity move (unbalanced)",
  amen_opmove_ub = "Amenities opportunity move (unbalanced)",

  oiqtl = "Quintile OI",
  oiqtl_lag1 = "Previous quintile OI",
  oiqtl_wn = "Quintile OI (weighted)",
  oiqtl_wn_lag1 = "Previous quintile OI (weighted)",
  
  workiqtl_wn = "Work quintile (weighted)",
  ameniqtl_wn = "Amenities quintile (weighted)",
  
  workiqtl_wn_lag1 = "Previous work quintile (weighted)",
  ameniqtl_wn_lag1 = "Previous amenities quintile (weighted)"
  

)

#-------------------------------------------------------------------------------# 

#### Save dataset ####
save(soepoi_v4, file = (paste0(data_path, "soepoi_v4.RData")))

rm(list = setdiff(ls(), c("soepoi_v4")))

#-------------------------------------------------------------------------------# 

#-------------------------------------------------------------------------------# 
#### C | LMR-level data ####
#-------------------------------------------------------------------------------# 
orig_path <- "data_original/"
data_path <- "data_created/"
#-------------------------------------------------------------------------------# 
#### 8 | Re-weight OI for LMR analysis ####
#-------------------------------------------------------------------------------# 

#-------------------------------------------------------------------------------# 
##### 8.1 Load and merge data and cross-walks ##### 
#-------------------------------------------------------------------------------# 

# Extract population data from SOEP
bev_soep <- soepoi_v4 %>%
  ungroup() %>%
  filter(syear == 2017) %>%
  select(kreis = kkz_rek,
         nuts3 = nuts3,
         bev17 = kr_population) %>%
  distinct(kreis, nuts3, bev17) # keep only distinct values

# Import OI by Kreis separately
oi_v4 <- read_csv(paste0(data_path, 'opportunity_KE_by_krs.csv'))

# Import cross-walk from BBSR (version: 2021)
CWlmr <- read_excel(paste0(orig_path, "cross-walks/raumgliederungen-referenzen-2021.xlsx"), sheet = "Kreisreferenz")

CWlmr <- CWlmr %>%
  select( kreis = KRS2021,
          kreis_name = KRS_NAME,
          lmr_cd = KAM2021,
          lmr_name = KAM_NAME, 
          nuts3_cd = N3D2021,
          nuts3_name = N3D_NAME
          
  )

# Drop first row containing variable descriptions
CWlmr <- CWlmr %>%
  slice(-1) 

#-------------------------------------------------------------------------------# 
## Merge data ##
#-------------------------------------------------------------------------------# 

# Merge population data (at kreis-level) to cross-walk data
# Drop last 3 0 digits in krs code from admin data
CWlmr$krs <- substr(CWlmr$kreis, 1, nchar(CWlmr$kreis) - 3)
CWlmr <- CWlmr %>%
  select(-kreis)
CWlmr$krs <- as.numeric(CWlmr$krs)

bev_soep$krs <- as.numeric(bev_soep$kreis)

df_admin <- left_join(bev_soep, CWlmr, by = "krs")

# Manually add LMR info for Eisenach prev kreisfreie stadt, which is not anymore included in the 2021 CW from krs --> AMR
df_admin <- df_admin %>%
  mutate(
    kreis_name = ifelse(krs == 16056, "Eisenach, kreisfreie Stadt", kreis_name),
    lmr_cd = ifelse(krs == 16056, 214, lmr_cd), # lmr Eisenach
    lmr_name = ifelse(krs == 16056, "Eisenach", lmr_name),
    nuts3_name = ifelse(krs == 16056, "Eisenach, kreisfreie Stadt", nuts3_name),
    nuts3_cd = ifelse(krs == 16056, "DEG0N", nuts3_cd)
    
  )


# Calculate population share (share of kreis-level population within labor market region --> weighting)
df_admin <- df_admin %>%
  group_by(lmr_cd) %>%                           # Step 1: Group by labor market region
  mutate(total_pop_lmr = sum(bev17, na.rm = TRUE),  # Step 2: Calculate total population per labor market region
         bev_krs_sh = bev17 / total_pop_lmr) %>%
  ungroup()


# Merge admin data with opportunity index data by 

# Check population weights
df_admin <- df_admin %>%
  group_by(lmr_cd) %>%                          
  mutate(check_pop = sum(bev_krs_sh, na.rm = TRUE)) %>%
  ungroup()

# Merge
df <- left_join(oi_v4, df_admin, by = "krs")

# Select vars
names(df)

df_sel <- df %>%
  select("krs", "opportunity_KE_pc01", "pc1_KE_01", "pc2_KE_01", "kreis_name", "lmr_cd", "lmr_name", "nuts3_cd", "nuts3_name", "bev17", "total_pop_lmr", "bev_krs_sh")


#-------------------------------------------------------------------------------# 
##### 8.2 Compute weighted index #####
#-------------------------------------------------------------------------------# 

# Re-weight krs index and components by population within AMRs
df_sel <- df_sel %>%
  mutate(opportunity_KE_pc01_pw = opportunity_KE_pc01 * bev_krs_sh,
         pc1_KE_01_pw = pc1_KE_01 * bev_krs_sh,
         pc2_KE_01_pw = pc2_KE_01 * bev_krs_sh
  )

# Sum oi by lmr
df_sel_lmr <- df_sel %>%
  group_by(lmr_cd, lmr_name) %>%      
  summarise(opportunity_KE_pc01_lmr = sum(opportunity_KE_pc01_pw, na.rm = TRUE),
            pc1_KE_01_lmr = sum(pc1_KE_01_pw, na.rm = TRUE),
            pc2_KE_01_lmr = sum(pc2_KE_01_pw, na.rm = TRUE)
  ) %>%
  ungroup()

# Create final data set for merge with SOEP data via krs
df_sel_lmr <- df_sel_lmr %>% select(-lmr_name)
names(df_sel_lmr)
names(df_sel)

df_oi_lmr <- left_join(df_sel, df_sel_lmr, by = "lmr_cd") %>%
  select(krs, kreis_name, lmr_cd, lmr_name, nuts3_cd, nuts3_name, total_pop_lmr, opportunity_KE_pc01_lmr, pc1_KE_01_lmr, pc2_KE_01_lmr)


write.csv(df_oi_lmr, (paste0(data_path, "opportunity_KE_by_lmr.csv")), row.names=FALSE)


#-------------------------------------------------------------------------------# 
##### 8.3 Merge LMR index to main data #####
#-------------------------------------------------------------------------------# 

# Recode kkz_rek as numeric for merge
soepoi_v4$krs <- as.numeric(soepoi_v4$kkz_rek)

# LMR OI
oi_lmr <- read.csv((paste0(data_path, "opportunity_KE_by_lmr.csv")))

# Merge lmr oi
soepoi_v4 <- left_join(soepoi_v4, oi_lmr, by = "krs")

# Generate lags
soepoi_v4 = soepoi_v4[order(soepoi_v4$pid, soepoi_v4$syear), ]

soepoi_v4 <- soepoi_v4 %>%
  group_by(pid) %>%
  dplyr::mutate(lmr_lag1_ub = dplyr::lag(lmr_cd, n=1, default = NA),
                krs_lag1_ub = dplyr::lag(krs, n=1, default = NA)
  ) %>%
  as.data.frame() %>%
  ungroup()

# Generate moved (unabalnced) variables
soepoi_v4$lmr_moved_ub <- ifelse(soepoi_v4$lmr_cd != soepoi_v4$lmr_lag1_ub & !is.na(soepoi_v4$lmr_lag1_ub), 1, 0 )
soepoi_v4$lmr_moved_ub[is.na(soepoi_v4$lmr_lag1_ub)] <- NA

soepoi_v4$krs_moved_ub <- ifelse(soepoi_v4$krs != soepoi_v4$krs_lag1_ub & !is.na(soepoi_v4$krs_lag1_ub), 1, 0 )
soepoi_v4$krs_moved_ub[is.na(soepoi_v4$krs_lag1_ub)] <- NA


#-------------------------------------------------------------------------------# 
##### 8.4 Gen opmove var (unbalanced) #####
#-------------------------------------------------------------------------------# 

# Calculate index change (oichange)

## Lag variable for descriptives (unbalanced)

soepoi_v4 = soepoi_v4[order(soepoi_v4$pid, soepoi_v4$syear), ]

soepoi_v4 <- soepoi_v4 %>%
  group_by(pid) %>%
  dplyr::mutate(oiw_lag1_ub_lmr = dplyr::lag(opportunity_KE_pc01_lmr, n=1, default = NA)) %>%
  as.data.frame() %>%
  ungroup()

# Create oichange
soepoi_v4$oichange_ub_lmr <- (soepoi_v4$opportunity_KE_pc01_lmr - soepoi_v4$oiw_lag1_ub_lmr)

# Create opmove
soepoi_v4$opmove_ub_lmr <- NA
soepoi_v4$opmove_ub_lmr[soepoi_v4$oichange_ub_lmr > 0] <- 1 # up
soepoi_v4$opmove_ub_lmr[soepoi_v4$oichange_ub_lmr == 0] <- 2 # stable
soepoi_v4$opmove_ub_lmr[soepoi_v4$oichange_ub_lmr < 0] <- 3 # down

# Assign value labels
val_labels(soepoi_v4$opmove_ub_lmr) <- c("Up" = 1, "Stable" = 2, "Down" = 3)

#-------------------------------------------------------------------------------# 
#### D | Final sample selection ####
#-------------------------------------------------------------------------------# 

# Final Version # 
df <- subset(soepoi_v4, netto >= 10 & netto <= 18 & age >= 18 & age <= 70 & syear >= 2009)

# Distance
summary(df$distance_km[df$moved_krs_bal == 1])

#-------------------------------------------------------------------------------# 
### IV (Opportunity index) ###
#-------------------------------------------------------------------------------# 

# Rename index variables to avoid adjustment #
df <-  rename(df, opportunity_pc01_wn = opportunity_KE_pc01_wn)
df$opportunity_pc01_wn

# opportunity index value (+ 3 lags)
table(df$opportunity_pc01_wn)

# Rename to previous 
df$pc1_01_wn <- df$pc1_KE_01_wn

# Rename to previous 
df$pc2_01_wn <- df$pc2_KE_01_wn

#-------------------------------------------------------------------------------# 

# Set reference categories of some categorical variables #
df$hhtype <- relevel(as.factor(df$hhtype), ref = 5) # --> other HH-type = 5

df$opmove <- relevel(as.factor(df$opmove), ref = 2) # --> in school = 0


#-------------------------------------------------------------------------------# 
#### Save dataset ####
#-------------------------------------------------------------------------------# 

# Create final datasets for 2009, 2010 (main), and 2014 needed for analysis

df_2009 <- df

save(df_2009, file = (paste0(data_path, "df_2009_soepoi_v4.RData")))

#-------------------------------------------------------------------------------# 
# Clear workspace
rm(list=ls())
#-------------------------------------------------------------------------------# 


